# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os
import plotly.offline  # FIX: `import plotly.graph_objects as go` binds only `go`,
                       # not `plotly`, so the init_notebook_mode() call below
                       # raised NameError without this import.
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# White background for every plotly figure in this analysis.
pio.templates.default = "plotly_white"
# Enable offline rendering of plotly figures inside the notebook.
plotly.offline.init_notebook_mode()

# All raw CSVs live in <parent-of-cwd>/datasets/customer_subscription.
# (A duplicate `path_dir = os.path.dirname(os.getcwd())` assignment that was
# immediately overwritten has been removed as dead code.)
path_dir = os.path.join(os.path.dirname(os.getcwd()), 'datasets', 'customer_subscription')
os.listdir(path_dir)
['product_info.csv', 'customer_cases.csv', 'customer_product.csv', 'customer_info.csv']
# Load the customer support cases and parse the event timestamp.
cases_path = os.path.join(path_dir, "customer_cases.csv")
customer_cases = pd.read_csv(cases_path)
customer_cases["date_time"] = pd.to_datetime(customer_cases["date_time"])
customer_cases.head()
| Unnamed: 0 | case_id | date_time | customer_id | channel | reason | |
|---|---|---|---|---|---|---|
| 0 | 1 | CC101 | 2017-01-01 10:32:03 | C2448 | phone | signup |
| 1 | 2 | CC102 | 2017-01-01 11:35:47 | C2449 | phone | signup |
| 2 | 3 | CC103 | 2017-01-01 11:37:09 | C2450 | phone | signup |
| 3 | 4 | CC104 | 2017-01-01 13:28:14 | C2451 | phone | signup |
| 4 | 5 | CC105 | 2017-01-01 13:52:22 | C2452 | phone | signup |
# Load the subscription table and parse both event timestamps.
product_path = os.path.join(path_dir, "customer_product.csv")
customer_product = pd.read_csv(product_path)
for ts_col in ("signup_date_time", "cancel_date_time"):
    customer_product[ts_col] = pd.to_datetime(customer_product[ts_col])
customer_product.head()
| Unnamed: 0 | customer_id | product | signup_date_time | cancel_date_time | |
|---|---|---|---|---|---|
| 0 | 1 | C2448 | prd_1 | 2017-01-01 10:35:09 | NaT |
| 1 | 2 | C2449 | prd_1 | 2017-01-01 11:39:29 | 2021-09-05 10:00:02 |
| 2 | 3 | C2450 | prd_1 | 2017-01-01 11:42:00 | 2019-01-13 16:24:55 |
| 3 | 4 | C2451 | prd_2 | 2017-01-01 13:32:08 | NaT |
| 4 | 5 | C2452 | prd_1 | 2017-01-01 13:57:30 | 2021-06-28 18:06:01 |
# CHECK: every customer holds at most 1 distinct product
# (max of per-customer distinct-product counts should be 1).
products_per_customer = customer_product.groupby(['customer_id']).agg({'product': 'nunique'})
products_per_customer.max()
product 1 dtype: int64
# Attach each case to the customer's subscription record; the
# auto-generated CSV index columns from both sides are discarded.
merged_cases = customer_cases.merge(customer_product, on=['customer_id'], how='left')
df = merged_cases.drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)
df.head()
| case_id | date_time | customer_id | channel | reason | product | signup_date_time | cancel_date_time | |
|---|---|---|---|---|---|---|---|---|
| 0 | CC101 | 2017-01-01 10:32:03 | C2448 | phone | signup | prd_1 | 2017-01-01 10:35:09 | NaT |
| 1 | CC102 | 2017-01-01 11:35:47 | C2449 | phone | signup | prd_1 | 2017-01-01 11:39:29 | 2021-09-05 10:00:02 |
| 2 | CC103 | 2017-01-01 11:37:09 | C2450 | phone | signup | prd_1 | 2017-01-01 11:42:00 | 2019-01-13 16:24:55 |
| 3 | CC104 | 2017-01-01 13:28:14 | C2451 | phone | signup | prd_2 | 2017-01-01 13:32:08 | NaT |
| 4 | CC105 | 2017-01-01 13:52:22 | C2452 | phone | signup | prd_1 | 2017-01-01 13:57:30 | 2021-06-28 18:06:01 |
# Load the product reference table (name, price, billing cycle).
info_path = os.path.join(path_dir, "product_info.csv")
product_info = pd.read_csv(info_path)
product_info.head()
| product_id | name | price | billing_cycle | |
|---|---|---|---|---|
| 0 | prd_1 | annual_subscription | 1200 | 12 |
| 1 | prd_2 | monthly_subscription | 125 | 1 |
# Bring in product metadata; rename the join key so it matches df's column.
renamed_info = product_info.rename(columns={'product_id': 'product'})
df = df.merge(renamed_info, on=['product'], how='left')
df.head()
| case_id | date_time | customer_id | channel | reason | product | signup_date_time | cancel_date_time | name | price | billing_cycle | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CC101 | 2017-01-01 10:32:03 | C2448 | phone | signup | prd_1 | 2017-01-01 10:35:09 | NaT | annual_subscription | 1200 | 12 |
| 1 | CC102 | 2017-01-01 11:35:47 | C2449 | phone | signup | prd_1 | 2017-01-01 11:39:29 | 2021-09-05 10:00:02 | annual_subscription | 1200 | 12 |
| 2 | CC103 | 2017-01-01 11:37:09 | C2450 | phone | signup | prd_1 | 2017-01-01 11:42:00 | 2019-01-13 16:24:55 | annual_subscription | 1200 | 12 |
| 3 | CC104 | 2017-01-01 13:28:14 | C2451 | phone | signup | prd_2 | 2017-01-01 13:32:08 | NaT | monthly_subscription | 125 | 1 |
| 4 | CC105 | 2017-01-01 13:52:22 | C2452 | phone | signup | prd_1 | 2017-01-01 13:57:30 | 2021-06-28 18:06:01 | annual_subscription | 1200 | 12 |
# Load customer demographics (age, gender).
demographics_path = os.path.join(path_dir, "customer_info.csv")
customer_info = pd.read_csv(demographics_path)
customer_info.head()
| Unnamed: 0 | customer_id | age | gender | |
|---|---|---|---|---|
| 0 | 1 | C2448 | 76 | female |
| 1 | 2 | C2449 | 61 | male |
| 2 | 3 | C2450 | 58 | female |
| 3 | 4 | C2451 | 62 | female |
| 4 | 5 | C2452 | 71 | male |
# Add age/gender per customer.
# NOTE(review): this merge defaults to an inner join, unlike the earlier
# left joins — cases whose customer is absent from customer_info get
# dropped here; confirm that is intended.
demographics = customer_info.drop('Unnamed: 0', axis=1)
df = df.merge(demographics, on=['customer_id'])
df.head()
| case_id | date_time | customer_id | channel | reason | product | signup_date_time | cancel_date_time | name | price | billing_cycle | age | gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CC101 | 2017-01-01 10:32:03 | C2448 | phone | signup | prd_1 | 2017-01-01 10:35:09 | NaT | annual_subscription | 1200 | 12 | 76 | female |
| 1 | CC102 | 2017-01-01 11:35:47 | C2449 | phone | signup | prd_1 | 2017-01-01 11:39:29 | 2021-09-05 10:00:02 | annual_subscription | 1200 | 12 | 61 | male |
| 2 | CC103 | 2017-01-01 11:37:09 | C2450 | phone | signup | prd_1 | 2017-01-01 11:42:00 | 2019-01-13 16:24:55 | annual_subscription | 1200 | 12 | 58 | female |
| 3 | CC104 | 2017-01-01 13:28:14 | C2451 | phone | signup | prd_2 | 2017-01-01 13:32:08 | NaT | monthly_subscription | 125 | 1 | 62 | female |
| 4 | CC4491 | 2017-03-31 12:06:58 | C2451 | phone | support | prd_2 | 2017-01-01 13:32:08 | NaT | monthly_subscription | 125 | 1 | 62 | female |
# Latest timestamp observed anywhere in the data — used below as the
# censoring cut-off for customers who have not cancelled.
latest_per_column = [
    df['cancel_date_time'].max(),
    df['signup_date_time'].max(),
    df['date_time'].max(),
]
date_max = max(latest_per_column)
date_max
Timestamp('2022-01-01 06:32:53')
# Days from each case to the cancellation. A NaT cancel date means the
# customer is still subscribed: mark the observation as censored and use
# the end-of-data timestamp (date_max) as the duration endpoint instead.
df['duration'] = (df['cancel_date_time'] - df['date_time']).dt.days
df['censored'] = df['duration'].isna().astype(int)
censored_mask = df.censored == 1
df.loc[censored_mask, "duration"] = (date_max - df.loc[censored_mask, "date_time"]).dt.days
# Drop observations where the customer reached out after unsubscribing
# (non-positive durations).
df = df[df.duration > 0]
df.censored.mean()
0.7947466731855306
# Decade bucket label for age, e.g. 76 -> "[70,80[".
df['age_bin'] = df['age'].apply(lambda a: "[{},{}[".format(a // 10 * 10, (a // 10 + 1) * 10))
# Binary (0/1) encodings of the categorical columns.
df['product=prd_1'] = df['product'].map({'prd_1': 1, 'prd_2': 0})
df['gender=female'] = df['gender'].map({'female': 1, 'male': 0})
df['channel=email'] = df['channel'].map({'phone': 0, 'email': 1})
df['reason=support'] = df['reason'].map({'signup': 0, 'support': 1})
# Create a column containing, for a given observation,
# the number of times the customer has reached out before.
def get_nb_cases(patient_df):
    """Number one customer's cases 0..n-1 in their current row order.

    Adds an ``nb_cases`` column in place (intended for use via
    ``groupby().apply``) and returns the mutated frame.
    """
    patient_df['nb_cases'] = list(range(len(patient_df)))
    return patient_df
# Per-customer running case count; the identity lambda around
# get_nb_cases was redundant, so the function is passed directly.
df = df.groupby('customer_id').apply(get_nb_cases)
def _add_cyclical_features(frame):
    """Add sin/cos seasonality encodings derived from ``date_time``.

    Yearly (month), weekly (day of week) and daily (hour) cycles are each
    mapped onto the unit circle so the model sees e.g. December as
    adjacent to January. Uses vectorized ``.dt`` accessors instead of the
    original per-row ``apply``.

    BUG FIX: the weekday encoding previously divided by 6; ``dayofweek``
    ranges over 0..6 with period 7, so /6 aliased Monday (0) and
    Sunday (6) onto the same angle (cos(0) == cos(2*pi)).
    """
    dt = frame['date_time'].dt
    # yearly seasonality: months 1..12 mapped to [0, 2*pi)
    frame['date_month_cos'] = np.cos(2 * np.pi * (dt.month - 1) / 12)
    frame['date_month_sin'] = np.sin(2 * np.pi * (dt.month - 1) / 12)
    # weekly seasonality: dayofweek 0..6, period 7 (was erroneously /6)
    frame['date_weekday_cos'] = np.cos(2 * np.pi * dt.dayofweek / 7)
    frame['date_weekday_sin'] = np.sin(2 * np.pi * dt.dayofweek / 7)
    # daily seasonality: hours 0..23 mapped to [0, 2*pi)
    frame['date_hour_cos'] = np.cos(2 * np.pi * dt.hour / 24)
    frame['date_hour_sin'] = np.sin(2 * np.pi * dt.hour / 24)
    return frame

df = _add_cyclical_features(df)
# --- Exploratory plots ---
# NOTE: the bare px.* expressions below rely on notebook cell-display
# semantics to render (each was the last expression of its own cell).
# Distribution of survival durations.
px.histogram(df, x='duration')
# Duration spread per age decade.
px.box(df, y="duration", x='age_bin')
# Duration spread per categorical feature; .show() forces rendering
# for each figure produced inside the loop.
for c in ['channel', 'reason', 'name']:
    px.box(df, y="duration", x=c, width=500, height = 500).show()
# Correlation heatmap of all columns.
# NOTE(review): recent pandas raises on df.corr() when non-numeric
# columns are present unless numeric_only=True — confirm pandas version.
px.imshow(df.corr())
# Drop the raw categorical columns (their encoded versions remain),
# then persist the cleaned dataset for downstream modelling.
df = df.drop(columns=['reason', 'product', 'gender', 'channel'])
clean_path = os.path.join(os.path.dirname(os.getcwd()), "outputs/customer_subscription_clean.csv")
df.to_csv(clean_path, index=False)